library(ggplot2)
library(ggrepel)
library(ggpubr)
library(stringr)
library(MASS)
library(RColorBrewer)
library(DESeq2)
library(viridis)
library(ggpointdensity)
library(dplyr)
library(data.table)
library(ComplexHeatmap)
library(UpSetR)
library(readxl)
theme_set(theme_classic())
#dat_path = "~/research/data/MorPhiC/December-2023/JAX_RNAseq_ExtraEmbryonic/"
dat_path = "../../MorPhiC/data/December-2023/JAX_RNAseq_ExtraEmbryonic/"
dat_path = paste0(dat_path, "processed/Tables")meta_flat = read_excel("data/JAX_RNAseq_ExtraEmbryonic_meta_falttened.xlsx")
meta_flat = as.data.frame(meta_flat)
dim(meta_flat)## [1] 90 37
meta_flat[1:2,]## Dataset
## 1 JAX; CRISPR-Cas9 KO; Primitive Syncytium, Extra-embryonic mesoderm; Bulk RNA-seq; 10X CRISPR; Illumina
## 2 JAX; CRISPR-Cas9 KO; Primitive Syncytium, Extra-embryonic mesoderm; Bulk RNA-seq; 10X CRISPR; Illumina
## Counts file
## 1 PPARG_KO_H02_GT23-11450_GCGCAAGC-CGGCGTGA_S40_L002
## 2 PPARG_KO_G02_GT23-11449_ATATGGAT-TAATACAG_S50_L002
## Sequence file read 1
## 1 PPARG_KO_H02_GT23-11450_GCGCAAGC-CGGCGTGA_S40_L002_R1_001.fastq.gz
## 2 PPARG_KO_G02_GT23-11449_ATATGGAT-TAATACAG_S50_L002_R1_001.fastq.gz
## Sequence file read 2
## 1 PPARG_KO_H02_GT23-11450_GCGCAAGC-CGGCGTGA_S40_L002_R2_001.fastq.gz
## 2 PPARG_KO_G02_GT23-11449_ATATGGAT-TAATACAG_S50_L002_R2_001.fastq.gz
## INPUT LIBRARY PREPARATION ID RUN ID
## 1 GT23-11450 20230918_23-robson-010
## 2 GT23-11449 20230918_23-robson-010
## LIBRARY AVERAGE FRAGMENT SIZE LIBRARY INPUT AMOUNT VALUE
## 1 406 300
## 2 402 300
## LIBRARY INPUT AMOUNT UNIT LIBRARY FINAL YIELD VALUE LIBRARY FINAL YIELD UNIT
## 1 ngs 168.8 ngs
## 2 ngs 199.6 ngs
## LIBRARY CONCENTRATION VALUE LIBRARY CONCENTRATION UNIT LIBRARY PCR CYCLES
## 1 47.2 nM 10
## 2 66.8 nM 10
## LIBRARY PCR CYCLES FOR SAMPLE INDEX DIFFERENTIATED CELL LINE ID
## 1 NA PrS-MOK20026W-H02
## 2 NA PrS-MOK20026W-G02
## DIFFERENTIATED CELL LINE DESCRIPTION
## 1 KOLF2.2 deleted PPARG by deletion of full coding region (KO) derived primitive syncytium
## 2 KOLF2.2 deleted PPARG by deletion of full coding region (KO) derived primitive syncytium
## TIMEPOINT TIME-POINT UNIT TERMINALLY DIFFERENTIATED?
## 1 6 days Yes
## 2 6 days Yes
## Model System INPUT CELL LINE ID CLONE ID
## 1 extra-embryonic primitive syncytial cells MOK20026W-H02 H02
## 2 extra-embryonic primitive syncytial cells MOK20026W-G02 G02
## CELL LINE ID CELL LINE TYPE DERIVED FROM CELL LINE NAME
## 1 MOK20026W-H02 iPSC KOLF2.2J
## 2 MOK20026W-G02 iPSC KOLF2.2J
## CELL LINE DESCRIPTION
## 1 KOLF2.2 deleted PPARG by deletion of full coding region (KO)
## 2 KOLF2.2 deleted PPARG by deletion of full coding region (KO)
## ALTERED GENE SYMBOLS ALTERATION TYPE TARGETED GENOMIC REGION
## 1 PPARG deletion full coding region (KO)
## 2 PPARG deletion full coding region (KO)
## TARGETED GENOMIC REGION COORDINATES GUIDE SEQUENCE
## 1 chr3:12379702-12417187 CAACCATGGTCATTTCTGAA;GATCTTCTATGAAAGAGGGT
## 2 chr3:12379702-12417187 CAACCATGGTCATTTCTGAA;GATCTTCTATGAAAGAGGGT
## ZYGOSITY GENE EXPRESSION ALTERATION PROTOCOL ID
## 1 N.A JAXPE001
## 2 N.A JAXPE001
## LIBRARY PREPARATION PROTOCOL ID SEQUENCING PROTOCOL ID
## 1 JAXPL001 JAXPS001
## 2 JAXPL001 JAXPS001
## DIFFERENTIATION PROTOCOL ID
## 1 JAXPD001
## 2 JAXPD001
names(meta_flat)## [1] "Dataset"
## [2] "Counts file"
## [3] "Sequence file read 1"
## [4] "Sequence file read 2"
## [5] "INPUT LIBRARY PREPARATION ID"
## [6] "RUN ID"
## [7] "LIBRARY AVERAGE FRAGMENT SIZE"
## [8] "LIBRARY INPUT AMOUNT VALUE"
## [9] "LIBRARY INPUT AMOUNT UNIT"
## [10] "LIBRARY FINAL YIELD VALUE"
## [11] "LIBRARY FINAL YIELD UNIT"
## [12] "LIBRARY CONCENTRATION VALUE"
## [13] "LIBRARY CONCENTRATION UNIT"
## [14] "LIBRARY PCR CYCLES"
## [15] "LIBRARY PCR CYCLES FOR SAMPLE INDEX"
## [16] "DIFFERENTIATED CELL LINE ID"
## [17] "DIFFERENTIATED CELL LINE DESCRIPTION"
## [18] "TIMEPOINT"
## [19] "TIME-POINT UNIT"
## [20] "TERMINALLY DIFFERENTIATED?"
## [21] "Model System"
## [22] "INPUT CELL LINE ID"
## [23] "CLONE ID"
## [24] "CELL LINE ID"
## [25] "CELL LINE TYPE"
## [26] "DERIVED FROM CELL LINE NAME"
## [27] "CELL LINE DESCRIPTION"
## [28] "ALTERED GENE SYMBOLS"
## [29] "ALTERATION TYPE"
## [30] "TARGETED GENOMIC REGION"
## [31] "TARGETED GENOMIC REGION COORDINATES"
## [32] "GUIDE SEQUENCE"
## [33] "ZYGOSITY"
## [34] "GENE EXPRESSION ALTERATION PROTOCOL ID"
## [35] "LIBRARY PREPARATION PROTOCOL ID"
## [36] "SEQUENCING PROTOCOL ID"
## [37] "DIFFERENTIATION PROTOCOL ID"
meta_flat$`DIFFERENTIATED CELL LINE ID`[1:5]## [1] "PrS-MOK20026W-H02" "PrS-MOK20026W-G02" "PrS-MOK20026W-E02"
## [4] "PrS-MOK20026P-E12" "PrS-MOK20026P-D10"
cell_line_id = str_split(meta_flat$`DIFFERENTIATED CELL LINE ID`, pattern = '-')
table(sapply(cell_line_id, length))##
## 3 4
## 66 24
cell_line_id = lapply(cell_line_id, function(x) { length(x) <- 4; x})
cell_line_id = as.data.frame(do.call(rbind, cell_line_id))
dim(cell_line_id)## [1] 90 4
names(cell_line_id) = c("model_system", "cell_line", "clone_id", "condition")
cell_line_id[1:2,]## model_system cell_line clone_id condition
## 1 PrS MOK20026W H02 <NA>
## 2 PrS MOK20026W G02 <NA>
cell_line_id$condition[which(is.na(cell_line_id$condition))] = "nor"
table(meta_flat$`Model System`, cell_line_id$model_system)##
## ExM PrS
## extra-embryonic mesenchymal cells 12 0
## extra-embryonic primitive syncytial cells 0 78
table(cell_line_id$model_system, cell_line_id$condition, useNA="ifany")##
## hyp nor
## ExM 0 12
## PrS 12 66
run_id and ko_strategy.Run ID “20231004_23-robson-008-run2” is very similar to “20230918_23-robson-008” in experiment. It is merged into “20230918_23-robson-008”.
table(meta_flat$`RUN ID`)##
## 20230809_23-robson-005-run2 20230918_23-robson-008
## 30 23
## 20230918_23-robson-009 20230918_23-robson-010
## 12 12
## 20231004_23-robson-008-run2 20231004_23-robson-011
## 1 12
w2update = which(meta_flat$`RUN ID` == "20231004_23-robson-008-run2")
meta_flat$run_id = meta_flat$`RUN ID`
meta_flat$run_id[w2update] = "20230918_23-robson-008"
table(meta_flat$run_id, cell_line_id$model_system, useNA="ifany")##
## ExM PrS
## 20230809_23-robson-005-run2 0 30
## 20230918_23-robson-008 0 24
## 20230918_23-robson-009 0 12
## 20230918_23-robson-010 0 12
## 20231004_23-robson-011 12 0
table(meta_flat$run_id, cell_line_id$condition, useNA="ifany")##
## hyp nor
## 20230809_23-robson-005-run2 0 30
## 20230918_23-robson-008 12 12
## 20230918_23-robson-009 0 12
## 20230918_23-robson-010 0 12
## 20231004_23-robson-011 0 12
table(meta_flat$`TARGETED GENOMIC REGION`, useNA="ifany")##
## critical exon (CE) full coding region (KO)
## 24 24
## premature termination codon (PTC) <NA>
## 24 18
meta_flat$ko_strategy = str_extract(meta_flat$`TARGETED GENOMIC REGION`,
"\\(\\S+\\)$")
meta_flat$ko_strategy = str_remove_all(meta_flat$ko_strategy, "\\(|\\)")
meta_flat$ko_strategy[which(is.na(meta_flat$ko_strategy))] = "WT"
table(meta_flat$`TARGETED GENOMIC REGION`, meta_flat$ko_strategy,
useNA="ifany")##
## CE KO PTC WT
## critical exon (CE) 24 0 0 0
## full coding region (KO) 0 24 0 0
## premature termination codon (PTC) 0 0 24 0
## <NA> 0 0 0 18
meta_flat$ko_gene = meta_flat$`ALTERED GENE SYMBOLS`
meta_flat$ko_gene[which(is.na(meta_flat$ko_gene))] = "WT"
table(cell_line_id$model_system, meta_flat$ko_gene)##
## EPAS1 FOSB GCM1 GRHL1 ISL1 POU2F3 PPARG WT
## ExM 0 0 0 0 9 0 0 3
## PrS 18 9 9 9 0 9 9 15
table(meta_flat$run_id, meta_flat$ko_gene)##
## EPAS1 FOSB GCM1 GRHL1 ISL1 POU2F3 PPARG WT
## 20230809_23-robson-005-run2 0 9 0 9 0 9 0 3
## 20230918_23-robson-008 18 0 0 0 0 0 0 6
## 20230918_23-robson-009 0 0 9 0 0 0 0 3
## 20230918_23-robson-010 0 0 0 0 0 0 9 3
## 20231004_23-robson-011 0 0 0 0 9 0 0 3
table(meta_flat$ko_strategy, meta_flat$ko_gene)##
## EPAS1 FOSB GCM1 GRHL1 ISL1 POU2F3 PPARG WT
## CE 6 3 3 3 3 3 3 0
## KO 6 3 3 3 3 3 3 0
## PTC 6 3 3 3 3 3 3 0
## WT 0 0 0 0 0 0 0 18
meta = cbind(meta_flat, cell_line_id)
cts = fread(file.path(dat_path, "genesCounts.csv"), data.table = FALSE)
dim(cts)## [1] 38592 91
cts[1:2, c(1:2, (ncol(cts)-1):ncol(cts))]## Name POU2F3_KO_F04_GT23-10504_GGTACCTT-GACGTCTT_S33_L001
## 1 ENSG00000268674 0
## 2 ENSG00000271254 1030
## PTC_A09__Hypoxia_GT23-11309_AACGTTCC-AGTACTCC_S30_L001
## 1 0
## 2 489
## PTC_D10__Hypoxia_GT23-11310_GCAGAATT-TGGCCGGT_S22_L001
## 1 0
## 2 603
meta$file_id = meta$`Counts file`
setequal(names(cts)[-1], meta$file_id)## [1] TRUE
meta = meta[match(names(cts)[-1], meta$file_id),]
table(names(cts)[-1] == meta$file_id)##
## TRUE
## 90
cts_dat = data.matrix(cts[,-1])
rownames(cts_dat) = cts$Namegene_anno = fread("data/gencode_v44_primary_assembly_info.tsv")
dim(gene_anno)## [1] 62754 8
gene_anno[1:2,]## geneId chr strand start end ensembl_gene_id
## <char> <char> <char> <int> <int> <char>
## 1: ENSG00000000003.16 chrX - 100627108 100639991 ENSG00000000003
## 2: ENSG00000000005.6 chrX + 100584936 100599885 ENSG00000000005
## hgnc_symbol description
## <char> <char>
## 1: TSPAN6 tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858]
## 2: TNMD tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757]
table(rownames(cts_dat) %in% gene_anno$ensembl_gene_id)##
## TRUE
## 38592
# all ensembl gene IDs are contained in the annotation
setdiff(rownames(cts_dat), gene_anno$ensembl_gene_id)## character(0)
model_s = meta$model_system
get_q75 <- function(x){quantile(x, probs = 0.75)}
gene_info = data.frame(Name = cts$Name,
t(apply(cts_dat, 1, tapply, model_s, min)),
t(apply(cts_dat, 1, tapply, model_s, median)),
t(apply(cts_dat, 1, tapply, model_s, get_q75)),
t(apply(cts_dat, 1, tapply, model_s, max)))
dim(gene_info)## [1] 38592 9
gene_info[1:2,]## Name ExM PrS ExM.1 PrS.1 ExM.2 PrS.2 ExM.3 PrS.3
## ENSG00000268674 ENSG00000268674 0 0 0.0 0.0 0.00 0 0 3
## ENSG00000271254 ENSG00000271254 634 387 812.5 775.5 848.25 923 948 1169
names(gene_info)[2:9] = paste0(rep(c("ExM_", "PrS_"), times=4),
rep(c("min", "median", "q75", "max"), each=2))
dim(gene_info)## [1] 38592 9
gene_info[1:2,]## Name ExM_min PrS_min ExM_median PrS_median ExM_q75
## ENSG00000268674 ENSG00000268674 0 0 0.0 0.0 0.00
## ENSG00000271254 ENSG00000271254 634 387 812.5 775.5 848.25
## PrS_q75 ExM_max PrS_max
## ENSG00000268674 0 0 3
## ENSG00000271254 923 948 1169
summary(gene_info[,-1])## ExM_min PrS_min ExM_median PrS_median
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 0.0 Median : 0.0 Median : 2.0 Median : 1.0
## Mean : 376.3 Mean : 255.1 Mean : 530.5 Mean : 585.1
## 3rd Qu.: 120.0 3rd Qu.: 60.0 3rd Qu.: 188.5 3rd Qu.: 182.5
## Max. :512761.0 Max. :154723.0 Max. :797550.5 Max. :379480.0
## ExM_q75 PrS_q75 ExM_max PrS_max
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 1
## Median : 3.0 Median : 3.0 Median : 6.0 Median : 10
## Mean : 608.8 Mean : 712.7 Mean : 751.6 Mean : 1102
## 3rd Qu.: 228.0 3rd Qu.: 233.8 3rd Qu.: 285.0 3rd Qu.: 405
## Max. :886970.0 Max. :456738.2 Max. :928104.0 Max. :801404
table(gene_info$ExM_q75 > 0, gene_info$PrS_q75 > 0)##
## FALSE TRUE
## FALSE 12757 959
## TRUE 2139 22737
w2kp = which(gene_info$ExM_q75 > 0 | gene_info$PrS_q75 > 0)
cts_dat = cts_dat[w2kp,]
dim(cts_dat)## [1] 25835 90
gene_info = gene_info[w2kp,]
meta$read_depth = colSums(cts_dat)/1e6
meta$q75 = apply(cts_dat, 2, quantile, probs = 0.75)
ggplot(meta, aes(x=read_depth, y=q75, color=ko_strategy, shape = model_system)) +
geom_point(size=2, alpha=0.7) + ggtitle("Gene counts") +
scale_shape_manual(values = c(7, 16)) +
xlab("read depth (million)") + ylab("75 percentile of gene expression") +
labs(color = "KO type", shape = "Model") +
scale_color_brewer(palette = "Set1")meta$run_id_short = str_replace(meta$run_id, "^[^-]*-", "")
ggplot(meta, aes(x=read_depth, y=q75, color=run_id_short, shape = condition)) +
geom_point(size=2, alpha=0.7) + ggtitle("Gene counts") +
scale_shape_manual(values = c(7, 16)) +
xlab("read depth (million)") + ylab("75 percentile of gene expression") +
labs(color = "Run ID", shape = "Condition") +
scale_color_brewer(palette = "Set1")sample2kp = which(meta$ko_gene == "WT")
cts_dat_m = cts_dat[,sample2kp]
meta_m = meta[sample2kp,]
summary(meta_m$q75)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 361.0 422.2 469.8 478.4 540.5 592.0
stopifnot(all(meta_m$file_id == colnames(cts_dat_m)))
q75 = apply(cts_dat_m, 1, quantile, probs=0.75)
cts_dat_n = t(cts_dat_m[q75 > 0,])
cts_dat_n = log(median(meta_m$q75)*cts_dat_n/meta_m$q75 + 1)
dim(cts_dat_n)## [1] 18 23865
sample_pca = prcomp(cts_dat_n, center = TRUE, scale. = TRUE)
names(sample_pca)## [1] "sdev" "rotation" "center" "scale" "x"
pc_scores = sample_pca$x
eigen_vals = sample_pca$sdev^2
eigen_vals[1:5]/sum(eigen_vals)## [1] 0.41336077 0.18367275 0.08208396 0.04567809 0.03608958
pvs = round(eigen_vals[1:5]/sum(eigen_vals)*100,1)
pvs## [1] 41.3 18.4 8.2 4.6 3.6
PC_df = data.frame(pc_scores)
PC_df = cbind(PC_df, meta_m)
gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=condition, color=model_system)) +
geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
scale_shape_manual(values = c(7, 15, 16, 17)) +
labs(color="KO gene", shape ="KO strategy") +
xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) +
ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) +
ggtitle("Wild type samples") +
guides(
color = guide_legend(title = NULL, order = 1),
shape = guide_legend(title = NULL, order = 2)
) +
theme(
legend.margin = margin(0, 0, 0, 0),
legend.box.just = "left",
legend.direction = "vertical"
)
print(gPC)gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=condition, color=run_id_short)) +
geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
scale_shape_manual(values = c(7, 15, 16, 17)) +
labs(color="KO gene", shape ="KO strategy") +
xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) +
ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) +
ggtitle("Wild type samples") +
guides(
color = guide_legend(title = NULL, order = 1),
shape = guide_legend(title = NULL, order = 2)
) +
theme(
legend.margin = margin(0, 0, 0, 0),
legend.box.just = "left",
legend.direction = "vertical"
)
print(gPC)table(meta_m$run_id, meta_m$model_system)##
## ExM PrS
## 20230809_23-robson-005-run2 0 3
## 20230918_23-robson-008 0 6
## 20230918_23-robson-009 0 3
## 20230918_23-robson-010 0 3
## 20231004_23-robson-011 3 0
## Generate sample information matrix
cols2kp = c("model_system", "condition", "q75")
sample_info = meta_m[,cols2kp]
dim(sample_info)## [1] 18 3
sample_info[1:2,]## model_system condition q75
## 12 PrS nor 569.5
## 68 PrS hyp 545.0
sample_info$log_q75 = log(sample_info$q75)
dds = DESeqDataSetFromMatrix(cts_dat_m, colData=sample_info,
~ model_system + condition + log_q75)
dds = DESeq(dds)
## association with read-depth
res_rd = results(dds, name = "log_q75")
res_rd = as.data.frame(res_rd)
dim(res_rd)## [1] 25835 6
res_rd[1:2,]## baseMean log2FoldChange lfcSE stat pvalue
## ENSG00000271254 731.89279 0.0114876 0.2318513 0.04954726 0.9604832
## ENSG00000277196 46.50349 0.9817827 1.3388605 0.73329722 0.4633772
## padj
## ENSG00000271254 0.9997047
## ENSG00000277196 0.9997047
table(is.na(res_rd$padj))##
## FALSE TRUE
## 17751 8084
g0 = ggplot(res_rd %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle("Read depth")
print(g0)## association with model_system
dds_lrt = DESeq(dds, test="LRT", reduced = ~ condition + log_q75)
res_lrt = results(dds_lrt)
res_model_system = as.data.frame(res_lrt)
dim(res_model_system)## [1] 25835 6
res_model_system[1:2,]## baseMean log2FoldChange lfcSE stat pvalue
## ENSG00000271254 731.89279 0.0114876 0.2318513 4.06378556 0.04381218
## ENSG00000277196 46.50349 0.9817827 1.3388605 0.04552192 0.83104721
## padj
## ENSG00000271254 0.06908888
## ENSG00000277196 0.87097036
table(is.na(res_model_system$padj))##
## FALSE TRUE
## 23736 2099
g0 = ggplot(res_model_system %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle("Model system")
print(g0)## association with condition
dds_lrt = DESeq(dds, test="LRT", reduced = ~ model_system + log_q75)
res_lrt = results(dds_lrt)
res_condition = as.data.frame(res_lrt)
dim(res_condition)## [1] 25835 6
res_condition[1:2,]## baseMean log2FoldChange lfcSE stat pvalue
## ENSG00000271254 731.89279 0.0114876 0.2318513 0.2926949 0.58849873
## ENSG00000277196 46.50349 0.9817827 1.3388605 2.9420549 0.08630088
## padj
## ENSG00000271254 0.6952827
## ENSG00000277196 0.1494475
table(is.na(res_condition$padj))##
## FALSE TRUE
## 22739 3096
g0 = ggplot(res_condition %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle("Condition")
print(g0)For “PrS_nor”, the issue of mismatch among CE, KO and PTC between what is got from the “TARGETED GENOMIC REGION” column in flat metadata file and what is got from the filenames is still not solved, as of September 25, 2024. Currently, run it according to what is got from the “TARGETED GENOMIC REGION” column in flat metadata file. Update it later once the mismatch is solved
Explore the PC plots first, to decide running the models on what level of DE group.
Then run the DE analysis.
meta$model_condition = paste(meta$model_system, meta$condition, sep="_")
table(meta$model_condition)##
## ExM_nor PrS_hyp PrS_nor
## 12 12 66
table(meta$run_id, meta$model_condition)##
## ExM_nor PrS_hyp PrS_nor
## 20230809_23-robson-005-run2 0 0 30
## 20230918_23-robson-008 0 12 12
## 20230918_23-robson-009 0 0 12
## 20230918_23-robson-010 0 0 12
## 20231004_23-robson-011 12 0 0
unique_model_condition_ss = unique(meta$model_condition)
unique_model_condition_ss = sort(unique_model_condition_ss)
for(cur_mc in unique_model_condition_ss){
print(cur_mc)
sample2kp = which(meta$model_condition == cur_mc)
cts_dat_m = cts_dat[,sample2kp]
meta_m = meta[sample2kp,]
print(table(meta_m$ko_strategy, str_extract(colnames(cts_dat_m), "^[^_]+_[^_]+")))
extracted_ko_strings = str_extract_all(colnames(cts_dat_m), "CE|KO|PTC|WT")
print(table(sapply(extracted_ko_strings, length)))
print(table(meta_m$ko_strategy, unlist(extracted_ko_strings)))
if (cur_mc == "PrS_nor"){
print("The overlap between WT and KO is not real overlap. It is due to the gene name contains substring 'KO' in these three samples:")
print(meta_m$file_id[(meta_m$ko_strategy=="WT") & (unlist(extracted_ko_strings)=="KO")])
}
stopifnot(all(meta_m$file_id == colnames(cts_dat_m)))
q75 = apply(cts_dat_m, 1, quantile, probs=0.75)
cts_dat_n = t(cts_dat_m[q75 > 0,])
cts_dat_n = log(median(meta_m$q75)*cts_dat_n/meta_m$q75 + 1)
dim(cts_dat_n)
sample_pca = prcomp(cts_dat_n, center = TRUE, scale. = TRUE)
names(sample_pca)
pc_scores = sample_pca$x
eigen_vals = sample_pca$sdev^2
eigen_vals[1:5]/sum(eigen_vals)
pvs = round(eigen_vals[1:5]/sum(eigen_vals)*100,1)
pvs
PC_df = data.frame(pc_scores)
PC_df = cbind(PC_df, meta_m)
gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=ko_strategy, color=ko_gene)) +
geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
scale_shape_manual(values = c(7, 15, 16, 17)) +
xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) +
ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) +
ggtitle(cur_mc) +
guides(
color = guide_legend(title = NULL, order = 1),
shape = guide_legend(title = NULL, order = 2)
) +
theme(
legend.margin = margin(0, 0, 0, 0),
legend.box.just = "left",
legend.direction = "vertical"
)
print(gPC)
gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=run_id_short, color=ko_gene)) +
geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
scale_shape_manual(values = c(7, 15, 16, 17)) +
xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) +
ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) +
ggtitle(cur_mc) +
guides(
color = guide_legend(title = NULL, order = 1),
shape = guide_legend(title = NULL, order = 2)
) +
theme(
legend.margin = margin(0, 0, 0, 0),
legend.box.just = "left",
legend.direction = "vertical"
)
print(gPC)
gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=run_id_short, color=ko_gene,
alpha = ifelse(ko_gene == "WT", 1, 0.8))) +
geom_point(size=2.5) +
scale_color_brewer(palette = "Dark2") +
scale_shape_manual(values = c(7, 15, 16, 17)) +
xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) +
ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) +
ggtitle(cur_mc) + guides(alpha = "none") +
guides(
color = guide_legend(title = NULL, order = 1),
shape = guide_legend(title = NULL, order = 2)
) +
theme(
legend.margin = margin(0, 0, 0, 0),
legend.box.just = "left",
legend.direction = "vertical"
)
print(gPC)
table(meta_m$run_id, meta_m$ko_gene)
}## [1] "ExM_nor"
##
## ISL1_CE ISL1_KO ISL1_PTC ISL1_WT
## CE 3 0 0 0
## KO 0 3 0 0
## PTC 0 0 3 0
## WT 0 0 0 3
##
## 1
## 12
##
## CE KO PTC WT
## CE 3 0 0 0
## KO 0 3 0 0
## PTC 0 0 3 0
## WT 0 0 0 3
## [1] "PrS_hyp"
##
## CE_E06 CE_G07 CE_H06 KO_A03 KO_B01 KO_B02 PTC_A09 PTC_D10 PTC_G09 WT_1
## CE 1 1 1 0 0 0 0 0 0 0
## KO 0 0 0 1 1 1 0 0 0 0
## PTC 0 0 0 0 0 0 1 1 1 0
## WT 0 0 0 0 0 0 0 0 0 1
##
## WT_2 WT_3
## CE 0 0
## KO 0 0
## PTC 0 0
## WT 1 1
##
## 1
## 12
##
## CE KO PTC WT
## CE 3 0 0 0
## KO 0 3 0 0
## PTC 0 0 3 0
## WT 0 0 0 3
## [1] "PrS_nor"
##
## CE_E06 CE_G07 CE_H06 FOSB_CE FOSB_KO FOSB_PTC GCM1_CE GCM1_KO GCM1_PTC
## CE 1 1 1 0 0 3 3 0 0
## KO 0 0 0 3 0 0 0 3 0
## PTC 0 0 0 0 3 0 0 0 3
## WT 0 0 0 0 0 0 0 0 0
##
## GCM1_WT GHRL1_CE GHRL1_KO GHRL1_PTC KO_A03 KO_B01 KO_B02 KOLF2_FOSB
## CE 0 3 0 0 0 0 0 0
## KO 0 0 3 0 1 1 1 0
## PTC 0 0 0 3 0 0 0 0
## WT 3 0 0 0 0 0 0 1
##
## KOLF2_GHRL1 KOLF2_POU2F3 POU2F3_CE POU2F3_KO POU2F3_PTC PPARG_CE PPARG_KO
## CE 0 0 3 0 0 3 0
## KO 0 0 0 3 0 0 3
## PTC 0 0 0 0 3 0 0
## WT 1 1 0 0 0 0 0
##
## PPARG_PTC PPARG_WT PTC_A09 PTC_D10 PTC_G09 WT_1 WT_2 WT_3
## CE 0 0 0 0 0 0 0 0
## KO 0 0 0 0 0 0 0 0
## PTC 3 0 1 1 1 0 0 0
## WT 0 3 0 0 0 1 1 1
##
## 1
## 66
##
## CE KO PTC WT
## CE 15 0 3 0
## KO 3 15 0 0
## PTC 0 3 15 0
## WT 0 3 0 9
## [1] "The overlap between WT and KO is not real overlap. It is due to the gene name contains substring 'KO' in these three samples:"
## [1] "KOLF2_GHRL1_2_GT23-10492_TTACAGGA-GCTTGTCA_S6_L001"
## [2] "KOLF2_FOSB_1_GT23-10493_GACCTGAA-CTCACCAA_S32_L001"
## [3] "KOLF2_POU2F3_1_GT23-10491_CCGTGAAG-ATCCACTG_S38_L001"
Use each combination of (model system, condition) as DE group.
Run analysis for each DE group separately.
For the output files, create two versions, one with direct information, one with padj set to NA for the genes when the number of 0 per test is >=4.
In more details, for both versions of the outputs, make it one file per knockout gene per DE group (DE group can be model system, model system + condition, model system + run_id, model system + day, etc.). The first version of the output files contains gene id, and for each knockout strategy:
baseMean, log2FoldChange, lfcSE, stat, pvalue, padj
The second version of the output files contains gene id,
chr, strand, position, gene name
and for each knockout strategy:
log2FoldChange, pvalue, padj (set to be NA if the number of 0 per test >= 4),
In addition, save out a separate file of the sample size for each comparison. This needs to be by DE group.
df_size = NULL
release_name = "2023_12_JAX_RNAseq_ExtraEmbryonic"
output_dir = file.path("results", release_name)
raw_output_dir = file.path(output_dir, "raw")
processed_output_dir = file.path(output_dir, "processed")
if (!file.exists(raw_output_dir)){
dir.create(raw_output_dir, recursive = TRUE)
}
if (!file.exists(processed_output_dir)){
dir.create(processed_output_dir, recursive = TRUE)
}
unique_model_ss = unique(meta$model_system)
unique_model_ss = sort(unique_model_ss)
unique_model_ss## [1] "ExM" "PrS"
for (model1 in unique_model_ss){
print(model1)
sample2kp = which(meta$model_system == model1)
cts_dat_m = cts_dat[,sample2kp]
meta_m = meta[sample2kp,]
stopifnot(all(meta_m$file_id == colnames(cts_dat_m)))
print(table(meta_m$file_id == colnames(cts_dat_m), useNA="ifany"))
## Generate sample information matrix
cols2kp = c("model_system", "condition", "ko_strategy", "ko_gene", "run_id",
"run_id_short", "q75")
sample_info = meta_m[,cols2kp]
dim(sample_info)
sample_info[1:2,]
print(table(sample_info$ko_strategy, sample_info$ko_gene))
sample_info$ko_group = paste(sample_info$ko_gene,
sample_info$ko_strategy, sep="_")
print(table(sample_info$ko_group))
# use the combination of (model system, condition) as DE group
sample_info$de_grp = paste0(sample_info$model_system, "_",
sample_info$condition)
sorted_de_grps = sort(unique(sample_info$de_grp))
sorted_de_grps
for(d_group in sorted_de_grps){
print(d_group)
dg_index = which(sample_info$de_grp==d_group)
cts_dat_m_dg = cts_dat_m[, dg_index]
sample_info_dg = sample_info[dg_index, ]
stopifnot(all(sample_info_dg$de_grp==d_group))
print("-----------------------------------")
print(sprintf("DE analysis group %s DESeq2 results:", d_group))
print("-----------------------------------")
# do two steps of filtering
# first, filter based on q75 of each gene
# second, filter based on whether each gene is expressed in at least 4 cells
dg_q75 = apply(cts_dat_m_dg, 1, quantile, probs=0.75)
cts_dat_m_dg = cts_dat_m_dg[dg_q75 > 0,]
print(sprintf("After filtering by gene expression q75, the number of genes reduces from %d to %d",
length(dg_q75), nrow(cts_dat_m_dg)))
n_pos = rowSums(cts_dat_m_dg>0)
cts_dat_m_dg = cts_dat_m_dg[n_pos >= 4,]
if (length(n_pos)==nrow(cts_dat_m_dg)){
print("After filtering by nonzero expression in at least 4 samples, the number of genes does not change.")
}else{
print(sprintf("After filtering by nonzero expression in at least 4 samples, the number of genes reduces from %d to %d",
length(n_pos), nrow(cts_dat_m_dg)))
}
# update the q75 based on only genes used in the process
sample_info_dg$q75 = apply(cts_dat_m_dg, 2, quantile, probs = 0.75)
sample_info_dg$log_q75 = log(sample_info_dg$q75)
if (length(unique(sample_info_dg$run_id))==1){
dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg,
~ ko_group + log_q75)
}else{
dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg,
~ ko_group + run_id + log_q75)
}
start.time <- Sys.time()
dds = DESeq(dds)
end.time <- Sys.time()
time.taken <- end.time - start.time
print(time.taken)
## association with read-depth
res_rd = results(dds, name = "log_q75")
res_rd = as.data.frame(res_rd)
dim(res_rd)
res_rd[1:2,]
table(is.na(res_rd$padj))
g0 = ggplot(res_rd %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle(paste0(d_group, " read depth"))
print(g0)
## Rerun the analysis without read-depth if it is not significant for
## most genes, and the number of samples is small
## i.e., proportion of non-null in the distribution is smaller than 0.01
## (this needs to be restricted to the genes with not NA adjusted pvalue)
## or if smaller than 0.1 and the number of samples is not greater than 6
pi_1_rd = max(0, mean(res_rd$pvalue[!is.na(res_rd$padj)] < 0.05) - 0.05)
pi_1_rd = max(pi_1_rd, 0, 1 - 2*mean(res_rd$pvalue[!is.na(res_rd$padj)] > 0.5))
print(sprintf("prop of non-null for rd: %.2e", pi_1_rd))
if(pi_1_rd < 0.01 || (ncol(cts_dat_m_dg) <= 6 && pi_1_rd < 0.1 )){
print("rerun DESeq2 without read depth")
include_rd = 0
if (length(unique(sample_info_dg$run_id))==1){
dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg,
~ ko_group)
}else{
dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg,
~ ko_group + run_id)
}
start.time <- Sys.time()
dds = DESeq(dds)
end.time <- Sys.time()
time.taken <- end.time - start.time
print(time.taken)
}else{
include_rd = 1
}
## association with run_id
if (length(unique(sample_info_dg$run_id))>1){
if(include_rd){
dds_lrt = DESeq(dds, test="LRT", reduced = ~ ko_group + log_q75)
}else{
dds_lrt = DESeq(dds, test="LRT", reduced = ~ ko_group)
}
res_lrt = results(dds_lrt)
res_run_id = as.data.frame(res_lrt)
dim(res_run_id)
res_run_id[1:2,]
table(is.na(res_run_id$padj))
g0 = ggplot(res_run_id %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle(paste0(d_group, " Run ID"))
print(g0)
}
## DE analysis for each KO gene
table(sample_info_dg$ko_gene)
table(sample_info_dg$ko_gene, sample_info_dg$ko_strategy)
genos = setdiff(unique(sample_info_dg$ko_gene), "WT")
genos
ko_grp = c("CE", "KO", "PTC")
for(geno in genos){
res_geno_df = NULL
res_geno_reduced_df = NULL
res_geno = list()
for(k_grp1 in ko_grp){
res_g1 = results(dds, contrast = c("ko_group",
paste0(geno, "_", k_grp1), "WT_WT"))
res_g1 = signif(data.frame(res_g1), 3)
# add a column to record the number of zeros per test
test_index = which(sample_info_dg$ko_group%in%c(paste0(geno, "_", k_grp1), "WT_WT"))
cts_dat_m_dg_test = cts_dat_m_dg[, test_index]
n_zero = rowSums(cts_dat_m_dg_test==0)
res_g1$n_zeros = n_zero
# record the number of samples involved in the comparison
test_ko_group_vec = sample_info_dg$ko_group[test_index]
df_size = rbind(df_size,
c(d_group,
paste0(geno, "_", k_grp1),
sum(test_ko_group_vec!="WT_WT"),
sum(test_ko_group_vec=="WT_WT")))
# prepare a processed version of output
res_g1_reduced = res_g1[, c("log2FoldChange", "pvalue", "padj", "n_zeros")]
res_g1_reduced$padj[which(res_g1_reduced$n_zeros>=4)] = NA
if ((sum(test_ko_group_vec!="WT_WT")==1)|(sum(test_ko_group_vec=="WT_WT")==1)){
res_g1_reduced$padj = NA
}
res_g1_reduced = res_g1_reduced[, c("log2FoldChange", "pvalue", "padj")]
res_geno[[k_grp1]] = res_g1_reduced
if ((sum(test_ko_group_vec!="WT_WT")>1) & (sum(test_ko_group_vec=="WT_WT")>1)){
g1 = ggplot(res_g1_reduced %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle(paste0(d_group, " ", geno, "_", k_grp1))
print(g1)
}
tag1 = sprintf("%s_%s_", geno, k_grp1)
colnames(res_g1) = paste0(tag1, colnames(res_g1))
colnames(res_g1_reduced) = paste0(tag1, colnames(res_g1_reduced))
if(is.null(res_geno_df)){
res_geno_df = res_g1
}else if(!is.null(res_geno_df)){
stopifnot(all(rownames(res_geno_df) == rownames(res_g1)))
res_geno_df = cbind(res_geno_df, res_g1)
}
if(is.null(res_geno_reduced_df)){
res_geno_reduced_df = res_g1_reduced
}else if(!is.null(res_geno_reduced_df)){
stopifnot(all(rownames(res_geno_reduced_df) == rownames(res_g1_reduced)))
res_geno_reduced_df = cbind(res_geno_reduced_df, res_g1_reduced)
}
}
get_index <- function(x){
which(x$padj < 0.05 & abs(x$log2FoldChange) > log2(1.5))
}
w_ce = get_index(res_geno[["CE"]])
w_ko = get_index(res_geno[["KO"]])
w_ptc = get_index(res_geno[["PTC"]])
print(paste0("DE group ", d_group, " under KO gene ", geno, ":"))
print(paste0("number of DE genes from knock out strategy CE: ",
as.character(length(w_ce))))
print(paste0("number of DE genes from knock out strategy KO: ",
as.character(length(w_ko))))
print(paste0("number of DE genes from knock out strategy PTC: ",
as.character(length(w_ptc))))
ce_ko_overlap = length(intersect(rownames(res_geno[["CE"]])[w_ce],
rownames(res_geno[["KO"]])[w_ko]))
ko_ptc_overlap = length(intersect(rownames(res_geno[["KO"]])[w_ko],
rownames(res_geno[["PTC"]])[w_ptc]))
ptc_ce_overlap = length(intersect(rownames(res_geno[["PTC"]])[w_ptc],
rownames(res_geno[["CE"]])[w_ce]))
print(paste0("number of common DE genes by knock out strategies CE and KO: ",
as.character(ce_ko_overlap)))
print(paste0("number of common DE genes by knock out strategies KO and PTC: ",
as.character(ko_ptc_overlap)))
print(paste0("number of common DE genes by knock out strategies PTC and CE: ",
as.character(ptc_ce_overlap)))
if (max(length(w_ce), length(w_ko), length(w_ptc)) > 0){
m = make_comb_mat(list("CE" = rownames(res_geno[["CE"]])[w_ce],
"KO" = rownames(res_geno[["KO"]])[w_ko],
"PTC" = rownames(res_geno[["PTC"]])[w_ptc]))
g_up = UpSet(m)
print(g_up)
}
df1 = data.frame(padj_CE = res_geno[["CE"]]$padj,
padj_KO = res_geno[["KO"]]$padj,
padj_PTC = res_geno[["PTC"]]$padj)
cr1 = cor(df1$padj_PTC, df1$padj_CE, method = "spearman", use="pair")
cr2 = cor(df1$padj_PTC, df1$padj_KO, method = "spearman", use="pair")
g4 = ggplot(data = df1, mapping = aes(x = -log10(padj_PTC),
y = -log10(padj_CE))) +
geom_pointdensity() +
ggtitle(sprintf("%s, %s\nSpearman r = %.2f", d_group, geno, cr1)) +
scale_color_viridis()
g5 = ggplot(data = df1, mapping = aes(x = -log10(padj_PTC),
y = -log10(padj_KO))) +
geom_pointdensity() +
ggtitle(sprintf("%s, %s\nSpearman r = %.2f", d_group, geno, cr2)) +
scale_color_viridis()
print(g4)
print(g5)
dim(res_geno_df)
res_geno_df[1:2,]
dim(res_geno_reduced_df)
res_geno_reduced_df[1:2,]
res_df = data.frame(gene_ID=rownames(res_geno_df),
res_geno_df)
dim(res_df)
res_df[1:2,]
res_reduced_gene_anno = gene_anno[match(rownames(res_geno_reduced_df),
gene_anno$ensembl_gene_id), ]
stopifnot(all(rownames(res_geno_reduced_df)==res_reduced_gene_anno$ensembl_gene_id))
# set gene hgnc_symbol that equal "" to NA
hgnc_symbol_vec = res_reduced_gene_anno$hgnc_symbol
hgnc_symbol_vec[which(hgnc_symbol_vec=="")] = NA
res_reduced_df = data.frame(gene_ID=rownames(res_geno_reduced_df),
hgnc_symbol=hgnc_symbol_vec,
chr=res_reduced_gene_anno$chr,
strand=res_reduced_gene_anno$strand,
start=res_reduced_gene_anno$start,
end=res_reduced_gene_anno$end,
res_geno_reduced_df)
print("hgnc_symbol empty string and NA tables:")
print(table(res_reduced_df$hgnc_symbol=="", useNA="ifany"))
print(table(is.na(res_reduced_df$hgnc_symbol)))
dim(res_reduced_df)
res_reduced_df[1:2,]
fwrite(res_df,
sprintf("%s/%s_%s_%s_DEseq2_raw.tsv",
raw_output_dir, release_name, d_group, geno),
sep="\t")
fwrite(res_reduced_df,
sprintf("%s/%s_%s_%s_DEseq2.tsv",
processed_output_dir, release_name, d_group, geno),
sep="\t")
}
}
}## [1] "ExM"
##
## TRUE
## 12
##
## ISL1 WT
## CE 3 0
## KO 3 0
## PTC 3 0
## WT 0 3
##
## ISL1_CE ISL1_KO ISL1_PTC WT_WT
## 3 3 3 3
## [1] "ExM_nor"
## [1] "-----------------------------------"
## [1] "DE analysis group ExM_nor DESeq2 results:"
## [1] "-----------------------------------"
## [1] "After filtering by gene expression q75, the number of genes reduces from 25835 to 24876"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes reduces from 24876 to 23842"
## Time difference of 1.585154 mins
## [1] "prop of non-null for rd: 0.00e+00"
## [1] "rerun DESeq2 without read depth"
## Time difference of 8.388163 secs
## [1] "DE group ExM_nor under KO gene ISL1:"
## [1] "number of DE genes from knock out strategy CE: 652"
## [1] "number of DE genes from knock out strategy KO: 405"
## [1] "number of DE genes from knock out strategy PTC: 432"
## [1] "number of common DE genes by knock out strategies CE and KO: 343"
## [1] "number of common DE genes by knock out strategies KO and PTC: 324"
## [1] "number of common DE genes by knock out strategies PTC and CE: 380"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18935 4907
##
## FALSE TRUE
## 18935 4907
## [1] "PrS"
##
## TRUE
## 78
##
## EPAS1 FOSB GCM1 GRHL1 POU2F3 PPARG WT
## CE 6 3 3 3 3 3 0
## KO 6 3 3 3 3 3 0
## PTC 6 3 3 3 3 3 0
## WT 0 0 0 0 0 0 15
##
## EPAS1_CE EPAS1_KO EPAS1_PTC FOSB_CE FOSB_KO FOSB_PTC GCM1_CE
## 6 6 6 3 3 3 3
## GCM1_KO GCM1_PTC GRHL1_CE GRHL1_KO GRHL1_PTC POU2F3_CE POU2F3_KO
## 3 3 3 3 3 3 3
## POU2F3_PTC PPARG_CE PPARG_KO PPARG_PTC WT_WT
## 3 3 3 3 15
## [1] "PrS_hyp"
## [1] "-----------------------------------"
## [1] "DE analysis group PrS_hyp DESeq2 results:"
## [1] "-----------------------------------"
## [1] "After filtering by gene expression q75, the number of genes reduces from 25835 to 23911"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes reduces from 23911 to 23326"
## Time difference of 9.515882 secs
## [1] "prop of non-null for rd: 8.49e-02"
## [1] "DE group PrS_hyp under KO gene EPAS1:"
## [1] "number of DE genes from knock out strategy CE: 4196"
## [1] "number of DE genes from knock out strategy KO: 2140"
## [1] "number of DE genes from knock out strategy PTC: 2553"
## [1] "number of common DE genes by knock out strategies CE and KO: 1883"
## [1] "number of common DE genes by knock out strategies KO and PTC: 1628"
## [1] "number of common DE genes by knock out strategies PTC and CE: 1960"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18654 4672
##
## FALSE TRUE
## 18654 4672
## [1] "PrS_nor"
## [1] "-----------------------------------"
## [1] "DE analysis group PrS_nor DESeq2 results:"
## [1] "-----------------------------------"
## [1] "After filtering by gene expression q75, the number of genes reduces from 25835 to 23493"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes does not change."
## Time difference of 1.400877 mins
## [1] "prop of non-null for rd: 1.30e-01"
## [1] "DE group PrS_nor under KO gene POU2F3:"
## [1] "number of DE genes from knock out strategy CE: 2038"
## [1] "number of DE genes from knock out strategy KO: 590"
## [1] "number of DE genes from knock out strategy PTC: 8"
## [1] "number of common DE genes by knock out strategies CE and KO: 462"
## [1] "number of common DE genes by knock out strategies KO and PTC: 6"
## [1] "number of common DE genes by knock out strategies PTC and CE: 5"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18811 4682
##
## FALSE TRUE
## 18811 4682
## [1] "DE group PrS_nor under KO gene PPARG:"
## [1] "number of DE genes from knock out strategy CE: 4503"
## [1] "number of DE genes from knock out strategy KO: 3029"
## [1] "number of DE genes from knock out strategy PTC: 4014"
## [1] "number of common DE genes by knock out strategies CE and KO: 2690"
## [1] "number of common DE genes by knock out strategies KO and PTC: 2334"
## [1] "number of common DE genes by knock out strategies PTC and CE: 3173"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18811 4682
##
## FALSE TRUE
## 18811 4682
## [1] "DE group PrS_nor under KO gene GCM1:"
## [1] "number of DE genes from knock out strategy CE: 5299"
## [1] "number of DE genes from knock out strategy KO: 4488"
## [1] "number of DE genes from knock out strategy PTC: 4255"
## [1] "number of common DE genes by knock out strategies CE and KO: 4014"
## [1] "number of common DE genes by knock out strategies KO and PTC: 3770"
## [1] "number of common DE genes by knock out strategies PTC and CE: 3823"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18811 4682
##
## FALSE TRUE
## 18811 4682
## [1] "DE group PrS_nor under KO gene FOSB:"
## [1] "number of DE genes from knock out strategy CE: 121"
## [1] "number of DE genes from knock out strategy KO: 409"
## [1] "number of DE genes from knock out strategy PTC: 847"
## [1] "number of common DE genes by knock out strategies CE and KO: 43"
## [1] "number of common DE genes by knock out strategies KO and PTC: 324"
## [1] "number of common DE genes by knock out strategies PTC and CE: 103"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18811 4682
##
## FALSE TRUE
## 18811 4682
## [1] "DE group PrS_nor under KO gene EPAS1:"
## [1] "number of DE genes from knock out strategy CE: 1372"
## [1] "number of DE genes from knock out strategy KO: 40"
## [1] "number of DE genes from knock out strategy PTC: 304"
## [1] "number of common DE genes by knock out strategies CE and KO: 37"
## [1] "number of common DE genes by knock out strategies KO and PTC: 29"
## [1] "number of common DE genes by knock out strategies PTC and CE: 236"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18811 4682
##
## FALSE TRUE
## 18811 4682
## [1] "DE group PrS_nor under KO gene GRHL1:"
## [1] "number of DE genes from knock out strategy CE: 3439"
## [1] "number of DE genes from knock out strategy KO: 2483"
## [1] "number of DE genes from knock out strategy PTC: 5095"
## [1] "number of common DE genes by knock out strategies CE and KO: 1980"
## [1] "number of common DE genes by knock out strategies KO and PTC: 2044"
## [1] "number of common DE genes by knock out strategies PTC and CE: 3057"
## [1] "hgnc_symbol empty string and NA tables:"
##
## FALSE <NA>
## 18811 4682
##
## FALSE TRUE
## 18811 4682
colnames(df_size) = c("DE_group", "knockout_gene_strategy", "n_ko", "n_WT")
fwrite(df_size,
sprintf("%s/%s_DE_n_samples.tsv", output_dir, release_name),
sep="\t")gc()## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 8146656 435.1 12621695 674.1 NA 12621695 674.1
## Vcells 40262056 307.2 101126503 771.6 65536 101126503 771.6
sessionInfo()## R version 4.2.3 (2023-03-15)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] readxl_1.4.3 UpSetR_1.4.0
## [3] ComplexHeatmap_2.14.0 data.table_1.15.4
## [5] dplyr_1.1.4 ggpointdensity_0.1.0
## [7] viridis_0.6.5 viridisLite_0.4.2
## [9] DESeq2_1.38.3 SummarizedExperiment_1.28.0
## [11] Biobase_2.58.0 MatrixGenerics_1.10.0
## [13] matrixStats_1.3.0 GenomicRanges_1.50.2
## [15] GenomeInfoDb_1.34.9 IRanges_2.32.0
## [17] S4Vectors_0.36.2 BiocGenerics_0.44.0
## [19] RColorBrewer_1.1-3 MASS_7.3-60
## [21] stringr_1.5.1 ggpubr_0.6.0
## [23] ggrepel_0.9.5 ggplot2_3.5.1
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-8 bit64_4.0.5 doParallel_1.0.17
## [4] httr_1.4.7 tools_4.2.3 backports_1.5.0
## [7] bslib_0.8.0 utf8_1.2.4 R6_2.5.1
## [10] DBI_1.2.3 colorspace_2.1-1 GetoptLong_1.0.5
## [13] withr_3.0.1 tidyselect_1.2.1 gridExtra_2.3
## [16] bit_4.0.5 compiler_4.2.3 cli_3.6.3
## [19] Cairo_1.6-2 DelayedArray_0.24.0 labeling_0.4.3
## [22] sass_0.4.9 scales_1.3.0 digest_0.6.37
## [25] rmarkdown_2.28 XVector_0.38.0 pkgconfig_2.0.3
## [28] htmltools_0.5.8.1 highr_0.11 fastmap_1.2.0
## [31] GlobalOptions_0.1.2 rlang_1.1.4 rstudioapi_0.16.0
## [34] RSQLite_2.3.7 farver_2.1.2 shape_1.4.6.1
## [37] jquerylib_0.1.4 generics_0.1.3 jsonlite_1.8.8
## [40] BiocParallel_1.32.6 car_3.1-2 RCurl_1.98-1.16
## [43] magrittr_2.0.3 GenomeInfoDbData_1.2.9 Matrix_1.5-4.1
## [46] Rcpp_1.0.13 munsell_0.5.1 fansi_1.0.6
## [49] abind_1.4-5 lifecycle_1.0.4 stringi_1.8.4
## [52] yaml_2.3.10 carData_3.0-5 zlibbioc_1.44.0
## [55] plyr_1.8.9 blob_1.2.4 parallel_4.2.3
## [58] crayon_1.5.3 lattice_0.22-6 Biostrings_2.66.0
## [61] annotate_1.76.0 circlize_0.4.16 KEGGREST_1.38.0
## [64] locfit_1.5-9.10 knitr_1.48 pillar_1.9.0
## [67] rjson_0.2.21 ggsignif_0.6.4 geneplotter_1.76.0
## [70] codetools_0.2-20 XML_3.99-0.17 glue_1.7.0
## [73] evaluate_0.24.0 foreach_1.5.2 vctrs_0.6.5
## [76] png_0.1-8 cellranger_1.1.0 gtable_0.3.5
## [79] purrr_1.0.2 tidyr_1.3.1 clue_0.3-65
## [82] cachem_1.1.0 xfun_0.47 xtable_1.8-4
## [85] broom_1.0.6 rstatix_0.7.2 tibble_3.2.1
## [88] iterators_1.0.14 AnnotationDbi_1.60.2 memoise_2.0.1
## [91] cluster_2.1.6